MLMs

Intro and Data Read-In

This is a quick r script I wrote to merge the phenotype files and then analyze.

library(ggplot2)
library(tidyverse)
library(asreml)
library(psych)
library(knitr)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:emmeans':
## 
##     pigs
setwd("C:/Users/zwinn/OneDrive/University of Arkansas/Post Graduation Work")

hgawn<-read.csv("HGAWN_Aligned_Data.csv")

hgawn<-hgawn %>% 
  filter(!Variety=="6-Row-Barley") %>%
  select(Tray, Rep, Variety, 
         Days_to_heading, Date_of_anther_sampling, 
         Area_of_anthers_per_spike, Number_of_anthers_per_spike,
         Area_per_anther)

colnames(hgawn)[4:5]<-c("Heading_Date", "Anthesis_Date")

hgawn[,1:3]<-lapply(hgawn[,1:3], as.factor)
hgawn[,4:ncol(hgawn)]<-lapply(hgawn[,4:ncol(hgawn)], as.numeric)
lapply(hgawn[,1:ncol(hgawn)], class)
## $Tray
## [1] "factor"
## 
## $Rep
## [1] "factor"
## 
## $Variety
## [1] "factor"
## 
## $Heading_Date
## [1] "numeric"
## 
## $Anthesis_Date
## [1] "numeric"
## 
## $Area_of_anthers_per_spike
## [1] "numeric"
## 
## $Number_of_anthers_per_spike
## [1] "numeric"
## 
## $Area_per_anther
## [1] "numeric"
head(hgawn)
##   Tray Rep      Variety Heading_Date Anthesis_Date Area_of_anthers_per_spike
## 1    1   1   NC05-21642          116           119                     15.04
## 2    1   1 LA01110D-251          116           121                      3.59
## 3    1   1  AR06012-6-3          116           120                     13.72
## 4    1   1    TX13D5217          122            NA                        NA
## 5    1   1    VA12W-102          117           120                      8.09
## 6    1   1    AR04119-3          114            NA                        NA
##   Number_of_anthers_per_spike Area_per_anther
## 1                       33.25       0.4523308
## 2                        7.25       0.4951724
## 3                       29.00       0.4731034
## 4                          NA              NA
## 5                       16.00       0.5056250
## 6                          NA              NA

Linear Models

Here I loop through the to perform the following mixed linear model in ASReml:

\(y=V+R+\varepsilon\)

Where y is the response, V is the variety which is fixed, R is the rep which is the blocking factor and random, an \(\varepsilon\) is the residual error.

#set up traits
traits<-colnames(hgawn[4:ncol(hgawn)])

#create dataframe to bind onto
preds<-data.frame(Variety=unique(hgawn$Variety))

#create dataframe for AIC, BIC, and LogLik
crit<-data.frame(row.names = c("AIC", "BIC", "LogLik"))

#look at distribution
for (i in traits){
  hist(hgawn[,i], main = paste("Distribution of", i), xlab = i)
  boxplot(hgawn[,i], main=i)
}

#loop MLMs for each trait and pull BLUEs
for (i in traits){
  #announce step
  print(paste("------- Analyzing trait", i, "-------"))
  
  #run model
  mlm<-asreml(fixed=hgawn[,i]~1+Variety,
              random=~Rep,
              units=~idv(units),
              data=hgawn)
  
  #display anova table
  print(wald.asreml(mlm))
  
  #look at residuals
  print(plot(resid(mlm), main=paste("Residuals of", i, "Analysis")))
  
  #pull out fit criteria
  c<-as.data.frame(rbind(summary.asreml(mlm)$aic, summary.asreml(mlm)$bic, summary.asreml(mlm)$loglik))
  colnames(c)<-i
  crit<-cbind(crit,c)
  
  #Pull out fixed effects
  p<-predict(mlm, classify = "Variety")$pvals[,1:2]
  colnames(p)[2]<-i
  preds<-left_join(preds, p)
  
  #remove
  remove(mlm,c,p)
}
## [1] "------- Analyzing trait Heading_Date -------"
## License check Thu Mar 11 14:58:41 2021
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:42 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -583.966       1.47951    572 14:58:42    0.1
##  2      -583.796       1.47814    572 14:58:42    0.0
##  3      -583.625       1.47650    572 14:58:42    0.0
##  4      -583.519       1.47496    572 14:58:42    0.0
##  5      -583.503       1.47437    572 14:58:42    0.0
##  6      -583.503       1.47425    572 14:58:42    0.0
## Wald tests for fixed effects.
## Response: hgawn[, i]
## Terms added sequentially; adjusted for those above.
## 
##                Df Sum of Sq Wald statistic Pr(Chisq)    
## (Intercept)     1     89918          60992 < 2.2e-16 ***
## Variety       580      1761           1195 < 2.2e-16 ***
## residual (MS)             1                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -583.503       1.47425    572 14:58:43    0.1
##  2      -583.503       1.47425    572 14:58:43    0.0
##  3      -583.503       1.47425    572 14:58:43    0.3
## Joining, by = "Variety"

## [1] "------- Analyzing trait Anthesis_Date -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -309.475       1.19216    341 14:58:43    0.1
##  2      -309.453       1.19258    341 14:58:43    0.0
##  3      -309.442       1.19354    341 14:58:43    0.0
##  4      -309.441       1.19331    341 14:58:43    0.0
## Wald tests for fixed effects.
## Response: hgawn[, i]
## Terms added sequentially; adjusted for those above.
## 
##                Df Sum of Sq Wald statistic Pr(Chisq)    
## (Intercept)     1    424528         355756 < 2.2e-16 ***
## Variety       492       928            778 3.109e-15 ***
## residual (MS)             1                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:43 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -309.441       1.19330    341 14:58:44    0.1
##  2      -309.441       1.19330    341 14:58:44    0.0
##  3      -309.441       1.19330    341 14:58:44    0.2
## Joining, by = "Variety"
## [1] "------- Analyzing trait Area_of_anthers_per_spike -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:44 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -818.892       23.9949    340 14:58:44    0.1 (1 restrained)
##  2      -818.455       24.1021    340 14:58:44    0.0
##  3      -818.385       24.0683    340 14:58:44    0.0
##  4      -818.378       24.0567    340 14:58:44    0.0
## Warning in asreml(fixed = hgawn[, i] ~ 1 + Variety, random = ~Rep, units =
## ~idv(units), : Some components changed by more than 1% on the last iteration.

## Wald tests for fixed effects.
## Response: hgawn[, i]
## Terms added sequentially; adjusted for those above.
## 
##                Df Sum of Sq Wald statistic Pr(Chisq)    
## (Intercept)     1    9913.3         412.08 < 2.2e-16 ***
## Variety       494   15322.2         636.92 1.389e-05 ***
## residual (MS)          24.1                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:44 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1      -818.377       24.0549    340 14:58:44    0.1
##  2      -818.377       24.0549    340 14:58:44    0.0
##  3      -818.377       24.0548    340 14:58:45    0.2
## Joining, by = "Variety"

## [1] "------- Analyzing trait Number_of_anthers_per_spike -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1     -1053.386       95.3181    340 14:58:45    0.1 (1 restrained)
##  2     -1052.790       95.6540    340 14:58:45    0.0
##  3     -1052.770       95.5823    340 14:58:45    0.0
##  4     -1052.769       95.5661    340 14:58:45    0.0
## Wald tests for fixed effects.
## Response: hgawn[, i]
## Terms added sequentially; adjusted for those above.
## 
##                Df Sum of Sq Wald statistic Pr(Chisq)    
## (Intercept)     1     63488         664.33 < 2.2e-16 ***
## Variety       492     58067         607.61 0.0002784 ***
## residual (MS)            96                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1     -1052.769       95.5652    340 14:58:45    0.1
##  2     -1052.769       95.5652    340 14:58:45    0.0
##  3     -1052.769       95.5652    340 14:58:45    0.2
## Joining, by = "Variety"

## [1] "------- Analyzing trait Area_per_anther -------"
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:45 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1       598.832    0.00573189    340 14:58:45    0.1 (1 restrained)
##  2       599.481    0.00575030    340 14:58:45    0.0
##  3       599.490    0.00574741    340 14:58:45    0.0
##  4       599.490    0.00574694    340 14:58:45    0.0
## Wald tests for fixed effects.
## Response: hgawn[, i]
## Terms added sequentially; adjusted for those above.
## 
##                Df Sum of Sq Wald statistic Pr(Chisq)    
## (Intercept)     1    36.386         6331.4 < 2.2e-16 ***
## Variety       491     7.467         1299.3 < 2.2e-16 ***
## residual (MS)         0.006                             
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## NULL
## Model fitted using the gamma parameterization.
## ASReml 4.1.0 Thu Mar 11 14:58:46 2021
##           LogLik        Sigma2     DF     wall    cpu
##  1       599.490    0.00574693    340 14:58:46    0.1
##  2       599.490    0.00574693    340 14:58:46    0.0
##  3       599.490    0.00574693    340 14:58:46    0.2
## Joining, by = "Variety"

#look at the fit criteria
kable(crit, caption="Fit Criteria of RCBD for Each Trait")
Fit Criteria of RCBD for Each Trait
Heading_Date Anthesis_Date Area_of_anthers_per_spike Number_of_anthers_per_spike Area_per_anther
AIC 1171.0050 622.8826 1640.7552 2109.539 -1194.9796
BIC 1179.7033 630.5464 1648.4131 2117.197 -1187.3217
LogLik -583.5025 -309.4413 -818.3776 -1052.769 599.4898
#write out predictions
write.csv(preds,"BLUEs_2018_2019_HGAWN.csv", row.names = F)

#remove outlier for correlation study
preds<-preds[-as.numeric(rownames(preds)[which.max(preds$Area_of_anthers_per_spike)]),]

#make a pretty figure with no underscore
fig<-preds
colnames(fig)<-c("Variety","HD","AD","AOAPS","NOAPS","APA")

#display the raw values of the correlation
kable(cor(fig[,2:ncol(fig)], use="complete.obs"), caption="Correlations Among BLUEs")
Correlations Among BLUEs
HD AD AOAPS NOAPS APA
HD 1.0000000 0.7312329 -0.0820714 -0.1639121 0.1454097
AD 0.7312329 1.0000000 -0.2889538 -0.3971774 0.1214169
AOAPS -0.0820714 -0.2889538 1.0000000 0.8968776 0.4666715
NOAPS -0.1639121 -0.3971774 0.8968776 1.0000000 0.0659067
APA 0.1454097 0.1214169 0.4666715 0.0659067 1.0000000
#figure for publication
a<-ggpairs(fig[,2:ncol(fig)],
        lower = list(continous=wrap("smooth", color="blue")),
        diag =  list(continous="barDiag"),
        upper =  list(continous="cor")) 

a+theme(legend.position = "none", 
        panel.grid.major = element_blank(), 
        axis.ticks.x = element_blank(), 
        panel.border = element_rect(linetype = "dashed", colour = "black", fill = NA))
## Warning: Removed 16 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 102 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 104 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 104 rows containing missing values
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 102 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 102 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values

## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 104 rows containing missing values (geom_point).

## Warning: Removed 104 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 104 rows containing non-finite values (stat_density).
## Warning in ggally_statistic(data = data, mapping = mapping, na.rm = na.rm, :
## Removed 105 rows containing missing values
## Warning: Removed 105 rows containing missing values (geom_point).

## Warning: Removed 105 rows containing missing values (geom_point).

## Warning: Removed 105 rows containing missing values (geom_point).

## Warning: Removed 105 rows containing missing values (geom_point).
## Warning: Removed 105 rows containing non-finite values (stat_density).

PCA Analysis

#create a correlation matrix
S<-cor(fig[,2:ncol(fig)], use = "complete.obs")

#perform eigen value decomposition on the correlation matrix
V<-eigen(S)$vectors

#use prcomp to perform PCA
pca_S<-prcomp(V)

#summary of PCA
summary(pca_S)
## Importance of components:
##                         PC1  PC2  PC3  PC4       PC5
## Standard deviation     0.50 0.50 0.50 0.50 1.277e-17
## Proportion of Variance 0.25 0.25 0.25 0.25 0.000e+00
## Cumulative Proportion  0.25 0.50 0.75 1.00 1.000e+00
#pull loadings
load<-pca_S$rotation

#rename the rownames
rownames(load)<-rownames(S)

#change to dataframe
load<-as.data.frame(load)

#change point size
update_geom_defaults("point",list(size=4))

#2D plot loadings
ggplot(data = load, aes(x=PC1, y=PC2, color=rownames(load)))+
  geom_point()+
  theme(legend.title=element_blank())

#library
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
#3D plot of loadings
a<-plot_ly(data = load, x=~PC1, y=~PC2, z=~PC3, color = ~rownames(load)) %>%
   add_markers()
a